load package

library(GGally)
## 载入需要的程辑包:ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(car)
## 载入需要的程辑包:carData
library(readr)
library(caret)
## 载入需要的程辑包:lattice
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## 载入程辑包:'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(ggplot2)
library(gridExtra)
## 
## 载入程辑包:'gridExtra'
## The following object is masked from 'package:randomForest':
## 
##     combine
library(corrplot)
## corrplot 0.92 loaded
library(nnet)
library(rpart)
library(rpart.plot)
library(caret)
library(randomForest)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## 载入程辑包:'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(MLmetrics)
## 
## 载入程辑包:'MLmetrics'
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
## The following object is masked from 'package:base':
## 
##     Recall
library(mgcv)
## 载入需要的程辑包:nlme
## This is mgcv 1.9-0. For overview type 'help("mgcv-package")'.
## 
## 载入程辑包:'mgcv'
## The following object is masked from 'package:nnet':
## 
##     multinom
library(nnet)
library(caret)
library(VGAM)
## 载入需要的程辑包:stats4
## 载入需要的程辑包:splines
## 
## 载入程辑包:'VGAM'
## The following object is masked from 'package:mgcv':
## 
##     s
## The following object is masked from 'package:caret':
## 
##     predictors
## The following object is masked from 'package:car':
## 
##     logit

load data

dat_red_wine <- read_csv("C:\\Users\\Chang\\Downloads\\wineQualityReds.csv", show_col_types = FALSE)

Data Preprocesing

missing values

missing_values <- sapply(dat_red_wine, function(x) sum(is.na(x)))
print(missing_values)
##           id  fix_acidity  vol_acidity  citric_acid        sugar    chlorides 
##            0            0            0            0            0            0 
##  free_sulfur total_sulfur      density           pH    sulphates      alcohol 
##            0            0            0            0            0            0 
##      quality 
##            0

We found that the quality of the data is still relatively high, without any missing data, so we do not need to do much preprocessing for the dataset.

Descriptive Statistics

dat_red_wine <- dat_red_wine[, !(names(dat_red_wine) %in% c("id"))]
summary(dat_red_wine)
##   fix_acidity     vol_acidity      citric_acid        sugar       
##  Min.   : 4.60   Min.   :0.1200   Min.   :0.000   Min.   : 0.900  
##  1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090   1st Qu.: 1.900  
##  Median : 7.90   Median :0.5200   Median :0.260   Median : 2.200  
##  Mean   : 8.32   Mean   :0.5284   Mean   :0.271   Mean   : 2.539  
##  3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420   3rd Qu.: 2.600  
##  Max.   :15.90   Max.   :1.5800   Max.   :1.000   Max.   :15.500  
##    chlorides        free_sulfur     total_sulfur       density      
##  Min.   :0.01000   Min.   : 1.00   Min.   :  6.00   Min.   :0.9900  
##  1st Qu.:0.07000   1st Qu.: 7.00   1st Qu.: 22.00   1st Qu.:1.0000  
##  Median :0.08000   Median :14.00   Median : 38.00   Median :1.0000  
##  Mean   :0.08787   Mean   :15.87   Mean   : 46.47   Mean   :0.9985  
##  3rd Qu.:0.09000   3rd Qu.:21.00   3rd Qu.: 62.00   3rd Qu.:1.0000  
##  Max.   :0.61000   Max.   :72.00   Max.   :289.00   Max.   :1.0000  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.740   Min.   :0.3300   Min.   : 8.40   Min.   :3.000  
##  1st Qu.:3.210   1st Qu.:0.5500   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.310   Median :0.6200   Median :10.20   Median :6.000  
##  Mean   :3.311   Mean   :0.6581   Mean   :10.42   Mean   :5.636  
##  3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.90   Max.   :8.000
#we need to separate the data set into training data and test data.

# Split the data based on density
data_density_low <- dat_red_wine[dat_red_wine$density == 0.99, ]
data_density_high <- dat_red_wine[dat_red_wine$density == 1, ]

# Create partition for low-density subset
trainIndex_low <- createDataPartition(data_density_low$quality, p = 0.8, list = FALSE)
training_data_low <- data_density_low[trainIndex_low, ]
test_data_low <- data_density_low[-trainIndex_low, ]

# Create partition for high-density subset
trainIndex_high <- createDataPartition(data_density_high$quality, p = 0.8, list = FALSE)
training_data_high <- data_density_high[trainIndex_high, ]
test_data_high <- data_density_high[-trainIndex_high, ]

# Combine the two training datasets and two test datasets
training_data <- rbind(training_data_low, training_data_high)
test_data <- rbind(test_data_low, test_data_high)

training_data$quality <- as.factor(training_data$quality)
test_data$quality <- as.factor(test_data$quality)

It’s easy to miss crucial details. Upon further examination, we found a significant disparity in the distribution of the two values within the ‘density’ variable. Specifically, the value of 0.99 has 239 data points. Given its substantial representation, it’s imperative not to disregard this predictor. Thus, when splitting the dataset into TRAINING DATA and TEST DATA, we must ensure a randomized sampling to maintain a balancedA representation.

examine the variables.

plot_list <- lapply(names(dat_red_wine), function(feature) {
  ggplot(dat_red_wine, aes_string(feature)) +
    geom_histogram(binwidth = (max(dat_red_wine[[feature]]) - min(dat_red_wine[[feature]])) / 30, fill = "blue", alpha = 0.7) +
    labs(title = feature, x = NULL) +
    theme(
      axis.text.y = element_text(size = 10),               # Adjust the y-axis text size
      plot.title = element_text(size = 10, hjust = 0.5),   # Adjust the plot title size and center it
      axis.text.x = element_text(angle = 45, hjust = 1)    # Optionally, rotate x-axis text for better visibility
    )
})
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Combine all the plots into a matrix
do.call(grid.arrange, c(plot_list, ncol = 3))

# Using ggplot2 for Q-Q plots
qq_plot_list <- lapply(names(dat_red_wine), function(feature) {
  ggplot(dat_red_wine, aes(sample = .data[[feature]])) +
    geom_qq() +
    geom_qq_line(color = "red") +
    labs(title = paste("Q-Q Plot for", feature))
})

# Display the Q-Q plots, 3 plots in a row
do.call(grid.arrange, c(qq_plot_list, ncol = 3))

# Compute the correlation matrix
cor_matrix <- cor(dat_red_wine)

# Visualize the correlation matrix using corrplot
corrplot(cor_matrix, method = "circle")

fix_acidity, vol_acidity, and pH have distributions that somewhat resemble a normal distribution, but with slight skews. sugar, chlorides, and total_sulfur are right-skewed. density shows a significant spike at one value, indicating that many data points have similar values.

Model selection

Combined with some of our exploration of the dataset above, we find that there are no missing values in the dataset and the sample size is relatively large, so I feel that we should perhaps not consider the effect of outliers for each variable at the outset. Combined with the fact that the wine quality score (the dependent variable) is a MULTIPLE CLASS CLASSIFICATION, we should probably start by trying a multinomial logistic regression model.

\textcolor{red}{It’s crucial to highlight that the ‘density’ variable possesses only two distinct values: 0.99 and 1. This observation can be made both from the descriptive statistics table and histogram plots. Despite being represented numerically, ‘density’ effectively functions as a binary categorical variable. Consequently, we need to make appropriate adjustments for its representation.}

logistic regression model

# Convert density into a factor with levels "Low" and "High"
training_data$density <- as.factor(ifelse(training_data$density == 0.99, "Low", "High"))
test_data$density <- as.factor(ifelse(test_data$density == 0.99, "Low", "High"))

# Now check the number of "Low" values in both datasets
sum(training_data$density == "Low")
## [1] 193
sum(test_data$density == "Low")
## [1] 46
# Fit the multinomial logistic regression model directly
model_logistic <- nnet::multinom(quality ~ ., data = training_data)
## # weights:  78 (60 variable)
## initial  value 2297.035640 
## iter  10 value 1548.598165
## iter  20 value 1403.717553
## iter  30 value 1273.540439
## iter  40 value 1194.559021
## iter  50 value 1186.417273
## iter  60 value 1184.221869
## iter  70 value 1182.384161
## iter  80 value 1181.674896
## iter  90 value 1181.462187
## iter 100 value 1181.359223
## final  value 1181.359223 
## stopped after 100 iterations
# Summary of the model
summary(model_logistic)
## Call:
## nnet::multinom(formula = quality ~ ., data = training_data)
## 
## Coefficients:
##   (Intercept) fix_acidity vol_acidity citric_acid        sugar chlorides
## 4    7.913758  -0.7617293   -3.993881   2.2513989 -0.000309244 -12.61312
## 5   21.851190  -0.6950431   -6.982684   1.8909467 -0.219482260 -15.19580
## 6   13.723786  -0.5596795   -9.515647   0.7116874 -0.184021368 -18.25162
## 7    2.368481  -0.4483686  -12.228159   0.6874896 -0.053970845 -26.20279
## 8   12.164032  -0.7235198   -6.955719   3.2555632 -0.043471093 -63.00624
##   free_sulfur total_sulfur densityLow         pH  sulphates  alcohol
## 4  -0.2604252    0.1695727 -0.3154050  -2.962708 -0.5822431 1.280722
## 5  -0.2325215    0.1820953 -1.0107277  -5.033877 -1.1560602 1.003868
## 6  -0.2115335    0.1660598 -0.6910471  -5.084535  0.8794702 1.791349
## 7  -0.2084531    0.1555211 -0.5276856  -4.564941  4.0041269 2.439408
## 8  -0.2192503    0.1424396  0.3474192 -10.009061  6.0963141 2.951050
## 
## Std. Errors:
##   (Intercept) fix_acidity vol_acidity citric_acid     sugar chlorides
## 4   4.5084640   0.3447327    2.279481    1.596895 0.2279134 3.0620359
## 5   2.4875949   0.3103174    2.240822    1.245398 0.2132718 1.8939836
## 6   2.3253993   0.3095426    2.266154    1.227436 0.2145596 1.8871560
## 7   3.2899220   0.3159850    2.394850    1.352127 0.2227879 3.2777969
## 8   0.6935292   0.3799003    3.233608    2.547682 0.3454623 0.1077255
##   free_sulfur total_sulfur densityLow       pH sulphates   alcohol
## 4   0.1258937   0.08779871   1.343141 2.233323  2.270663 0.7552035
## 5   0.1235166   0.08750954   1.298650 1.964174  2.020090 0.7349128
## 6   0.1235331   0.08751737   1.303409 1.961003  2.011036 0.7362732
## 7   0.1241689   0.08768562   1.329273 2.055974  2.057470 0.7432192
## 8   0.1318294   0.08925648   1.571043 2.232564  2.532944 0.8130599
## 
## Residual Deviance: 2362.718 
## AIC: 2482.718
# Predict classes on test data
predicted_classes <- predict(model_logistic, test_data)

# Generate confusion matrix
confusion <- table(test_data$quality, predicted_classes)

# Print confusion matrix
print(confusion)
##    predicted_classes
##       3   4   5   6   7   8
##   3   0   0   1   0   0   0
##   4   0   1   4   4   0   0
##   5   0   0 101  37   0   0
##   6   0   0  39  78  11   0
##   7   0   0   2  23  12   0
##   8   0   0   0   2   2   0
# Compute accuracy
accuracy <- sum(diag(confusion)) / sum(confusion)
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.6056782
# Extract deviance residuals
residuals_deviance <- residuals(model_logistic, type = "deviance")

# Plot residuals
plot(residuals_deviance, main="Deviance Residuals", ylab="Residuals")
abline(h = 0, col = "red")

# Compute VIF for the model
vif_values <- vif(model_logistic)
## Warning in vif.default(model_logistic): No intercept: vifs may not be sensible.
print(vif_values)
##   fix_acidity   vol_acidity   citric_acid         sugar     chlorides 
## -8.713491e+11  1.742763e+14  1.306753e+16 -1.066555e+10 -6.371438e+15 
##   free_sulfur  total_sulfur       density            pH     sulphates 
##  1.112803e+08  5.656376e+07 -8.521890e+13 -1.395222e+15 -1.905526e+15 
##       alcohol 
## -8.209228e+11

The clustering of points away from the zero line indicates potential issues with model fit. Additionally, the warning message indicates an extremely large variance inflation factor (VIF) for some variables, suggesting multicollinearity.

Ridge Regression (L2 regularization):

# Ridge Regression with the 'glmnet' package

library(glmnet)
## 载入需要的程辑包:Matrix
## Loaded glmnet 4.1-8
# Create a model matrix
X_train <- model.matrix(quality ~ . - 1, data=training_data)
y_train <- training_data$quality

X_test <- model.matrix(~ . - 1, data=test_data)


# Fit the ridge regression model for multinomial logistic regression
ridge_model <- cv.glmnet(X_train, y_train, alpha=0, family="multinomial")
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
# Best lambda value
best_lambda_ridge <- ridge_model$lambda.min

# Coefficients at best lambda
coef(ridge_model, s=best_lambda_ridge)
## $`3`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                         1
## (Intercept)  -4.119240385
## fix_acidity   0.017242700
## vol_acidity   2.462875572
## citric_acid  -0.553351454
## sugar         0.023280011
## chlorides     4.635871443
## free_sulfur  -0.005512432
## total_sulfur -0.004165141
## densityHigh  -0.029609949
## densityLow    0.029415268
## pH            0.652371972
## sulphates    -0.650982649
## alcohol      -0.119507011
## 
## $`4`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                          1
## (Intercept)  -1.8138961252
## fix_acidity  -0.0781583867
## vol_acidity   2.0493773275
## citric_acid  -0.3388665075
## sugar         0.0683067187
## chlorides     2.7814172424
## free_sulfur  -0.0117762943
## total_sulfur  0.0002577603
## densityHigh   0.0669687751
## densityLow   -0.0671448392
## pH            0.8905928227
## sulphates    -0.7872631480
## alcohol      -0.1711149291
## 
## $`5`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                         1
## (Intercept)  10.850441696
## fix_acidity  -0.082846528
## vol_acidity   0.488763160
## citric_acid   0.238512984
## sugar        -0.085150246
## chlorides     1.836700690
## free_sulfur  -0.001112931
## total_sulfur  0.013192153
## densityHigh   0.335844888
## densityLow   -0.336467340
## pH           -0.494010929
## sulphates    -1.822777051
## alcohol      -0.613254461
## 
## $`6`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                         1
## (Intercept)   3.706550137
## fix_acidity   0.025194060
## vol_acidity  -1.478855339
## citric_acid  -0.426618999
## sugar        -0.055658392
## chlorides    -0.695094861
## free_sulfur   0.015770853
## total_sulfur -0.001532446
## densityHigh   0.125708020
## densityLow   -0.125115583
## pH           -0.371252913
## sulphates    -0.141844679
## alcohol       0.038582886
## 
## $`7`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                         1
## (Intercept)  -5.236121196
## fix_acidity   0.087512615
## vol_acidity  -3.014268621
## citric_acid   0.355831590
## sugar         0.034421532
## chlorides    -5.476918321
## free_sulfur   0.008845127
## total_sulfur -0.005918224
## densityHigh  -0.029609157
## densityLow    0.030905518
## pH            0.095189700
## sulphates     2.157300844
## alcohol       0.499475853
## 
## $`8`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                         1
## (Intercept)  -3.387734127
## fix_acidity   0.031055539
## vol_acidity  -0.507892099
## citric_acid   0.724492387
## sugar         0.014800376
## chlorides    -3.081976195
## free_sulfur  -0.006214323
## total_sulfur -0.001834101
## densityHigh  -0.469302578
## densityLow    0.468406976
## pH           -0.772890653
## sulphates     1.245566683
## alcohol       0.365817663
# Count of each class in training_data
quality_counts_train <- table(training_data$quality)
print(quality_counts_train)
## 
##   3   4   5   6   7   8 
##   9  44 543 510 162  14
# Count of each class in test_data
quality_counts_test <- table(test_data$quality)
print(quality_counts_test)
## 
##   3   4   5   6   7   8 
##   1   9 138 128  37   4
# Fit the ridge regression model for multinomial logistic regression
ridge_model <- cv.glmnet(X_train, y_train, alpha=0, family="multinomial")
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
# Best lambda value
best_lambda_ridge <- ridge_model$lambda.min

# Coefficients at best lambda
coef(ridge_model, s=best_lambda_ridge)
## $`3`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                         1
## (Intercept)  -4.119240385
## fix_acidity   0.017242700
## vol_acidity   2.462875572
## citric_acid  -0.553351454
## sugar         0.023280011
## chlorides     4.635871443
## free_sulfur  -0.005512432
## total_sulfur -0.004165141
## densityHigh  -0.029609949
## densityLow    0.029415268
## pH            0.652371972
## sulphates    -0.650982649
## alcohol      -0.119507011
## 
## $`4`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                          1
## (Intercept)  -1.8138961252
## fix_acidity  -0.0781583867
## vol_acidity   2.0493773275
## citric_acid  -0.3388665075
## sugar         0.0683067187
## chlorides     2.7814172424
## free_sulfur  -0.0117762943
## total_sulfur  0.0002577603
## densityHigh   0.0669687751
## densityLow   -0.0671448392
## pH            0.8905928227
## sulphates    -0.7872631480
## alcohol      -0.1711149291
## 
## $`5`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                         1
## (Intercept)  10.850441696
## fix_acidity  -0.082846528
## vol_acidity   0.488763160
## citric_acid   0.238512984
## sugar        -0.085150246
## chlorides     1.836700690
## free_sulfur  -0.001112931
## total_sulfur  0.013192153
## densityHigh   0.335844888
## densityLow   -0.336467340
## pH           -0.494010929
## sulphates    -1.822777051
## alcohol      -0.613254461
## 
## $`6`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                         1
## (Intercept)   3.706550137
## fix_acidity   0.025194060
## vol_acidity  -1.478855339
## citric_acid  -0.426618999
## sugar        -0.055658392
## chlorides    -0.695094861
## free_sulfur   0.015770853
## total_sulfur -0.001532446
## densityHigh   0.125708020
## densityLow   -0.125115583
## pH           -0.371252913
## sulphates    -0.141844679
## alcohol       0.038582886
## 
## $`7`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                         1
## (Intercept)  -5.236121196
## fix_acidity   0.087512615
## vol_acidity  -3.014268621
## citric_acid   0.355831590
## sugar         0.034421532
## chlorides    -5.476918321
## free_sulfur   0.008845127
## total_sulfur -0.005918224
## densityHigh  -0.029609157
## densityLow    0.030905518
## pH            0.095189700
## sulphates     2.157300844
## alcohol       0.499475853
## 
## $`8`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                         1
## (Intercept)  -3.387734127
## fix_acidity   0.031055539
## vol_acidity  -0.507892099
## citric_acid   0.724492387
## sugar         0.014800376
## chlorides    -3.081976195
## free_sulfur  -0.006214323
## total_sulfur -0.001834101
## densityHigh  -0.469302578
## densityLow    0.468406976
## pH           -0.772890653
## sulphates     1.245566683
## alcohol       0.365817663
# Fit the lasso regression model for multinomial logistic regression
lasso_model <- cv.glmnet(X_train, y_train, alpha=1, family="multinomial")
## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground

## Warning in lognet(xd, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : one
## multinomial or binomial class has fewer than 8 observations; dangerous ground
# Best lambda value
best_lambda_lasso <- lasso_model$lambda.min

# Coefficients at best lambda
coef(lasso_model, s=best_lambda_lasso)
## $`3`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                         1
## (Intercept)  -3.942184003
## fix_acidity   .          
## vol_acidity   5.764633535
## citric_acid   .          
## sugar         .          
## chlorides     5.015592872
## free_sulfur   .          
## total_sulfur -0.001308046
## densityHigh   .          
## densityLow    .          
## pH            .          
## sulphates     .          
## alcohol      -0.069302628
## 
## $`4`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                          1
## (Intercept)  -0.5845775931
## fix_acidity  -0.1259320785
## vol_acidity   2.8643834617
## citric_acid   .           
## sugar         0.0342677145
## chlorides     1.5977038278
## free_sulfur  -0.0004847219
## total_sulfur  .           
## densityHigh   .           
## densityLow    .           
## pH            0.9788398766
## sulphates    -0.3652070199
## alcohol      -0.2080530404
## 
## $`5`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                          1
## (Intercept)   1.136674e+01
## fix_acidity  -9.362288e-02
## vol_acidity   1.771776e-01
## citric_acid   2.729137e-01
## sugar        -1.164389e-01
## chlorides     1.249552e+00
## free_sulfur   .           
## total_sulfur  1.819630e-02
## densityHigh   3.914318e-01
## densityLow   -1.581539e-13
## pH            .           
## sulphates    -1.740530e+00
## alcohol      -6.734157e-01
## 
## $`6`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                          1
## (Intercept)   4.184363e+00
## fix_acidity   .           
## vol_acidity  -1.960606e+00
## citric_acid  -3.902247e-01
## sugar        -8.379699e-02
## chlorides    -1.249552e+00
## free_sulfur   2.016941e-02
## total_sulfur  1.884240e-03
## densityHigh   7.486710e-02
## densityLow   -1.120118e-14
## pH           -5.072495e-02
## sulphates     .           
## alcohol       6.930263e-02
## 
## $`7`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                         1
## (Intercept)  -4.643161232
## fix_acidity   0.046117545
## vol_acidity  -4.287401570
## citric_acid   .          
## sugar         .          
## chlorides    -7.384083240
## free_sulfur   0.013424556
## total_sulfur -0.003438084
## densityHigh   .          
## densityLow    .          
## pH            0.006279827
## sulphates     2.708071911
## alcohol       0.681416808
## 
## $`8`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                          1
## (Intercept)  -6.381182e+00
## fix_acidity   .           
## vol_acidity  -1.771776e-01
## citric_acid   3.061870e-01
## sugar         .           
## chlorides    -6.604134e+00
## free_sulfur   .           
## total_sulfur  .           
## densityHigh  -6.864334e-01
## densityLow    2.047379e-13
## pH           -1.586055e+00
## sulphates     2.715704e+00
## alcohol       9.746242e-01

From the WARNING results, we identified a significant issue. For the validation and testing phases of our model, our training dataset exhibited problems in both the lasso model and the Ridge Regression model. This stemmed from an insufficient number of observations (fewer than 8) for some categories of our dependent variable, ‘quality’. In line with AI recommendations, one straightforward solution could be to reclassify the QUALITY scores: grouping (3-4), (5-6), and (7-8). However, we’re hesitant about this approach. We noticed that the count disparity between scores 3 and 4 is notable, making a direct merge questionable. Merging them might oversimplify the nuances between these scores. Additionally, while AI offered other methods, they seem more intricate. We’re considering the application of a random forest approach as a potential resolution to this challenge.

random forest

###decision tree
# Fit a decision tree
tree_model <- rpart(quality ~ ., data=training_data, method="class")

# Visualize the tree


rpart.plot(tree_model)

# Predict on test data
tree_predictions <- predict(tree_model, test_data, type="class")

# Evaluate the performance (assuming 'quality' is a factor)
confusionMatrix(tree_predictions, test_data$quality)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  3  4  5  6  7  8
##          3  0  0  0  0  0  0
##          4  0  0  0  0  0  0
##          5  1  6 95 36  1  0
##          6  0  3 43 85 24  2
##          7  0  0  0  7 12  2
##          8  0  0  0  0  0  0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6057          
##                  95% CI : (0.5495, 0.6598)
##     No Information Rate : 0.4353          
##     P-Value [Acc > NIR] : 8.164e-10       
##                                           
##                   Kappa : 0.3443          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity          0.000000  0.00000   0.6884   0.6641  0.32432  0.00000
## Specificity          1.000000  1.00000   0.7542   0.6190  0.96786  1.00000
## Pos Pred Value            NaN      NaN   0.6835   0.5414  0.57143      NaN
## Neg Pred Value       0.996845  0.97161   0.7584   0.7313  0.91554  0.98738
## Prevalence           0.003155  0.02839   0.4353   0.4038  0.11672  0.01262
## Detection Rate       0.000000  0.00000   0.2997   0.2681  0.03785  0.00000
## Detection Prevalence 0.000000  0.00000   0.4385   0.4953  0.06625  0.00000
## Balanced Accuracy    0.500000  0.50000   0.7213   0.6416  0.64609  0.50000
set.seed(34)

rf_model <- randomForest(quality ~ ., data=training_data, ntree=100)

# Predict on test data
rf_predictions <- predict(rf_model, test_data)

# Evaluate the performance
confusionMatrix(rf_predictions, test_data$quality)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   3   4   5   6   7   8
##          3   0   0   0   0   0   0
##          4   0   0   0   0   0   0
##          5   1   6 106  30   1   0
##          6   0   3  29  90  13   2
##          7   0   0   3   8  23   2
##          8   0   0   0   0   0   0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6909          
##                  95% CI : (0.6368, 0.7413)
##     No Information Rate : 0.4353          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4969          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity          0.000000  0.00000   0.7681   0.7031  0.62162  0.00000
## Specificity          1.000000  1.00000   0.7877   0.7513  0.95357  1.00000
## Pos Pred Value            NaN      NaN   0.7361   0.6569  0.63889      NaN
## Neg Pred Value       0.996845  0.97161   0.8150   0.7889  0.95018  0.98738
## Prevalence           0.003155  0.02839   0.4353   0.4038  0.11672  0.01262
## Detection Rate       0.000000  0.00000   0.3344   0.2839  0.07256  0.00000
## Detection Prevalence 0.000000  0.00000   0.4543   0.4322  0.11356  0.00000
## Balanced Accuracy    0.500000  0.50000   0.7779   0.7272  0.78760  0.50000

The random forest model has moderate performance with an accuracy of 69% on the test set. The model struggles with predicting some classes (e.g., 3 and 8) possibly due to class imbalances. While the model does relatively well for classes 5 and 6, there is room for improvement, especially for other classes.

feature_importance <- importance(rf_model)

feature_importance_df <- data.frame(
  Feature = row.names(feature_importance),
  Importance = feature_importance[, "MeanDecreaseGini"] # or "MeanDecreaseAccuracy"
)

ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_bar(stat="identity") +
  coord_flip() +
  labs(title = "Feature Importance from Random Forest", 
       x = "Features", 
       y = "Importance (Mean Decrease in Gini)") +
  theme_minimal()

Alcohol: This feature has the highest importance, meaning that it plays the most significant role in predicting wine quality in the Random Forest model. It might suggest that the alcohol content in the wine is a strong predictor of its quality.

Sulphates: The next important feature. Sulphates can influence the taste and longevity of wine, potentially playing a role in perceived quality.

Vol_acidity (Volatile Acidity): Also has a significant importance. Volatile acidity can affect the wine’s aroma and taste, hence its quality.

Total_sulfur, pH, Fix_acidity (Fixed Acidity), and others follow in descending order of importance. Each of these factors influences wine quality in their unique ways, either by affecting taste, aroma, or other sensory attributes.

Density: Appears to be the least important feature in predicting wine quality. This might suggest that, in the context of the other features, density doesn’t provide much additional information for predicting quality.

improve model — remove least important feature

# Removing least important features from the dataset
training_data <- training_data[, !names(training_data) %in% c('density', 'chlorides', 'free_sulfur')]
test_data <- test_data[, !names(test_data) %in% c('density', 'chlorides', 'free_sulfur')]


# Recreate the Random Forest model using the updated dataset
set.seed(123) # Setting seed for reproducibility

# Assuming 'quality' is the target variable
rf_model_updated <- randomForest(quality ~ ., data = training_data, ntree = 500, importance = TRUE)

# Checking the model's performance on the test set
predictions_updated <- predict(rf_model_updated, newdata = test_data)

# Calculating accuracy
accuracy_updated <- sum(predictions_updated == test_data$quality) / nrow(test_data)

print(paste("Updated Model Accuracy: ", round(accuracy_updated * 100, 2), "%"))
## [1] "Updated Model Accuracy:  73.82 %"
# If you want to view the updated feature importance, you can do the following:
feature_importance_updated <- importance(rf_model_updated)

feature_importance_df_updated <- data.frame(
  Feature = row.names(feature_importance_updated),
  Importance = feature_importance_updated[, "MeanDecreaseGini"]
)

library(ggplot2)

ggplot(feature_importance_df_updated, aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_bar(stat="identity") +
  coord_flip() +
  labs(title = "Updated Feature Importance from Random Forest", 
       x = "Features", 
       y = "Importance (Mean Decrease in Gini)") +
  theme_minimal()

After removing these variables, it still can not get a significant improvement in the model’s accuracy…

grid <- expand.grid(
  mtry = seq(2, ncol(training_data)-1),  # Number of variables randomly sampled as candidates at each split
  splitrule = c("gini", "extratrees"),    # Rule used to guide the recursive partitioning (for ranger method)
  min.node.size = c(1, 3, 5)              # Minimum sum of instance weights needed in a node (for ranger method)
)

# Use caret's train function for tuning
model <- train(
  quality ~ .,                             # Assuming 'quality' is your target column
  data=training_data, 
  method="ranger",                         # Using 'ranger' method as it's similar to randomForest but faster
  trControl=trainControl(
    method="cv",                           # Cross-validation
    number=3,                              # Number of folds
    search="grid"                          # Use grid search
  ),
  tuneGrid=grid
)

print(model)
## Random Forest 
## 
## 1282 samples
##    8 predictor
##    6 classes: '3', '4', '5', '6', '7', '8' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 855, 854, 855 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   min.node.size  Accuracy   Kappa    
##   2     gini        1              0.6653662  0.4578366
##   2     gini        3              0.6567846  0.4442281
##   2     gini        5              0.6630225  0.4538359
##   2     extratrees  1              0.6575598  0.4433934
##   2     extratrees  3              0.6614648  0.4480332
##   2     extratrees  5              0.6622473  0.4482231
##   3     gini        1              0.6606860  0.4537196
##   3     gini        3              0.6583459  0.4486131
##   3     gini        5              0.6606860  0.4522902
##   3     extratrees  1              0.6497698  0.4318936
##   3     extratrees  3              0.6559985  0.4410172
##   3     extratrees  5              0.6591266  0.4449069
##   4     gini        1              0.6591247  0.4515480
##   4     gini        3              0.6630188  0.4576716
##   4     gini        5              0.6614594  0.4546146
##   4     extratrees  1              0.6466436  0.4274608
##   4     extratrees  3              0.6591211  0.4483279
##   4     extratrees  5              0.6567938  0.4432543
##   5     gini        1              0.6497662  0.4369034
##   5     gini        3              0.6575671  0.4486713
##   5     gini        5              0.6552252  0.4449676
##   5     extratrees  1              0.6552142  0.4435003
##   5     extratrees  3              0.6599054  0.4487589
##   5     extratrees  5              0.6560022  0.4433616
##   6     gini        1              0.6528814  0.4431086
##   6     gini        3              0.6567865  0.4482073
##   6     gini        5              0.6552270  0.4455309
##   6     extratrees  1              0.6528814  0.4391911
##   6     extratrees  3              0.6505359  0.4344729
##   6     extratrees  5              0.6583441  0.4471615
##   7     gini        1              0.6544464  0.4456885
##   7     gini        3              0.6614612  0.4566907
##   7     gini        5              0.6544445  0.4444944
##   7     extratrees  1              0.6591174  0.4506996
##   7     extratrees  3              0.6528833  0.4386249
##   7     extratrees  5              0.6544391  0.4402927
##   8     gini        1              0.6591211  0.4520063
##   8     gini        3              0.6567901  0.4490158
##   8     gini        5              0.6552215  0.4462524
##   8     extratrees  1              0.6544336  0.4437254
##   8     extratrees  3              0.6598999  0.4510928
##   8     extratrees  5              0.6575708  0.4465953
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 2, splitrule = gini
##  and min.node.size = 1.
predictions <- predict(model, newdata=test_data)

# Calculate accuracy
accuracy <- sum(predictions == test_data$quality) / nrow(test_data)
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 0.725552050473186"

The Random Forest model’s accuracy of approximately 69% is reasonable but may not be optimal. This depends on the domain and the acceptable accuracy thresholds. In some domains, 65% might be very good, while in others, it might be considered low.The model struggles with predicting the extreme classes (‘3’ and ‘8’). It might be due to an imbalance in the dataset. If there are very few samples of these classes, the model might struggle to learn their patterns.

Spline:

Based on the correlation matrix images and histograms above, we decided to prioritize the construction of the splines based on the variables vol_acidity and alcohol (which are strongly correlated with the dependent variable quality and whose histograms exhibit some bimodal characteristics, indicating that these predictors are most likely to exhibit a nonlinear relationship with the dependent variable).

Fit the VGAM Model: Use the vglm function to fit the model. In this example, we’ll use spline terms for vol_acidity and alcohol since you mentioned these variables previously:

vgam_model <- vglm(quality ~ s(vol_acidity) + s(alcohol), 
                   family = multinomial(), 
                   data = training_data)
summary(vgam_model)
## 
## Call:
## vglm(formula = quality ~ s(vol_acidity) + s(alcohol), family = multinomial(), 
##     data = training_data)
## 
## Coefficients: 
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1     16.4635     5.9337   2.775 0.005527 ** 
## (Intercept):2     13.7776     3.5974   3.830 0.000128 ***
## (Intercept):3     24.6307     3.1979   7.702 1.34e-14 ***
## (Intercept):4     16.3638     3.1098   5.262 1.43e-07 ***
## (Intercept):5      9.8416     3.1244      NA       NA    
## s(vol_acidity):1  11.2820     2.4130   4.676 2.93e-06 ***
## s(vol_acidity):2   7.2018     1.9837   3.630 0.000283 ***
## s(vol_acidity):3   3.9612     1.8496   2.142 0.032221 *  
## s(vol_acidity):4   1.3166     1.8262   0.721 0.470936    
## s(vol_acidity):5  -2.2841     1.8786  -1.216 0.224042    
## s(alcohol):1      -2.2430     0.5930  -3.782 0.000155 ***
## s(alcohol):2      -1.4991     0.3022  -4.961 7.00e-07 ***
## s(alcohol):3      -2.1240     0.2585  -8.217  < 2e-16 ***
## s(alcohol):4      -1.1806     0.2475  -4.771 1.83e-06 ***
## s(alcohol):5      -0.5457     0.2472  -2.208 0.027269 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Number of linear predictors:  5 
## 
## Names of linear predictors: log(mu[,1]/mu[,6]), log(mu[,2]/mu[,6]), 
## log(mu[,3]/mu[,6]), log(mu[,4]/mu[,6]), log(mu[,5]/mu[,6])
## 
## Residual deviance: 2528.709 on 6395 degrees of freedom
## 
## Log-likelihood: -1264.354 on 6395 degrees of freedom
## 
## Number of Fisher scoring iterations: 8 
## 
## Warning: Hauck-Donner effect detected in the following estimate(s):
## '(Intercept):3', '(Intercept):4', '(Intercept):5', 's(alcohol):1', 's(alcohol):2'
## 
## 
## Reference group is level  6  of the response

Prediction and Evaluation

predictions <- predict(vgam_model, newdata = test_data, type = "response")
predicted_classes <- apply(predictions, 1, which.max)
predicted_classes <- factor(predicted_classes, levels = 1:nlevels(training_data$quality), labels = levels(training_data$quality))

# Assuming test_data$quality is available for comparison
confusionMatrix(predicted_classes, test_data$quality)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   3   4   5   6   7   8
##          3   0   0   0   0   0   0
##          4   0   0   0   0   0   0
##          5   1   9 101  47   2   0
##          6   0   0  37  76  26   2
##          7   0   0   0   5   9   2
##          8   0   0   0   0   0   0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5868          
##                  95% CI : (0.5304, 0.6415)
##     No Information Rate : 0.4353          
##     P-Value [Acc > NIR] : 4.337e-08       
##                                           
##                   Kappa : 0.3052          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity          0.000000  0.00000   0.7319   0.5938  0.24324  0.00000
## Specificity          1.000000  1.00000   0.6704   0.6561  0.97500  1.00000
## Pos Pred Value            NaN      NaN   0.6312   0.5390  0.56250      NaN
## Neg Pred Value       0.996845  0.97161   0.7643   0.7045  0.90698  0.98738
## Prevalence           0.003155  0.02839   0.4353   0.4038  0.11672  0.01262
## Detection Rate       0.000000  0.00000   0.3186   0.2397  0.02839  0.00000
## Detection Prevalence 0.000000  0.00000   0.5047   0.4448  0.05047  0.00000
## Balanced Accuracy    0.500000  0.50000   0.7011   0.6249  0.60912  0.50000
# Calculate and plot residuals
residuals <- residuals(vgam_model, type = "response")
plot(residuals ~ fitted(vgam_model), xlab = "Fitted Values", ylab = "Residuals")

AIC(vgam_model)
## [1] 2558.709
BIC(vgam_model)
## [1] 2636.052

The funnel-like shape, where the residuals are more spread out for the middle range of fitted values, suggests heteroscedasticity – the variance of the residuals is not constant. In the context of a multinomial model, this could mean that the probabilities predicted by the model for certain outcome categories are less consistent for certain ranges of predictors. ### Examine the Effect of Spline Complexity:

# Fit models with different degrees of freedom for splines
models <- list()
df_values <- c(3, 4, 5)

for (df in df_values) {
    model_name <- paste("Model_DF", df, sep = "_")
    models[[model_name]] <- vglm(quality ~ s(vol_acidity, df = df) + s(alcohol, df = df), 
                                 family = multinomial(), 
                                 data = training_data)
}

# Compare models using AIC and BIC
aic_values <- sapply(models, AIC)
bic_values <- sapply(models, BIC)

# Print the AIC and BIC values for each model
aic_values
## Model_DF_3 Model_DF_4 Model_DF_5 
##   2558.709   2558.709   2558.709
bic_values
## Model_DF_3 Model_DF_4 Model_DF_5 
##   2636.052   2636.052   2636.052
# Model summaries
lapply(models, summary)
## $Model_DF_3
## 
## Call:
## vglm(formula = quality ~ s(vol_acidity, df = df) + s(alcohol, 
##     df = df), family = multinomial(), data = training_data)
## 
## Coefficients: 
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1              16.4635     5.9337   2.775 0.005527 ** 
## (Intercept):2              13.7776     3.5974   3.830 0.000128 ***
## (Intercept):3              24.6307     3.1979   7.702 1.34e-14 ***
## (Intercept):4              16.3638     3.1098   5.262 1.43e-07 ***
## (Intercept):5               9.8416     3.1244      NA       NA    
## s(vol_acidity, df = df):1  11.2820     2.4130   4.676 2.93e-06 ***
## s(vol_acidity, df = df):2   7.2018     1.9837   3.630 0.000283 ***
## s(vol_acidity, df = df):3   3.9612     1.8496   2.142 0.032221 *  
## s(vol_acidity, df = df):4   1.3166     1.8262   0.721 0.470936    
## s(vol_acidity, df = df):5  -2.2841     1.8786  -1.216 0.224042    
## s(alcohol, df = df):1      -2.2430     0.5930  -3.782 0.000155 ***
## s(alcohol, df = df):2      -1.4991     0.3022  -4.961 7.00e-07 ***
## s(alcohol, df = df):3      -2.1240     0.2585  -8.217  < 2e-16 ***
## s(alcohol, df = df):4      -1.1806     0.2475  -4.771 1.83e-06 ***
## s(alcohol, df = df):5      -0.5457     0.2472  -2.208 0.027269 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Number of linear predictors:  5 
## 
## Names of linear predictors: log(mu[,1]/mu[,6]), log(mu[,2]/mu[,6]), 
## log(mu[,3]/mu[,6]), log(mu[,4]/mu[,6]), log(mu[,5]/mu[,6])
## 
## Residual deviance: 2528.709 on 6395 degrees of freedom
## 
## Log-likelihood: -1264.354 on 6395 degrees of freedom
## 
## Number of Fisher scoring iterations: 8 
## 
## Warning: Hauck-Donner effect detected in the following estimate(s):
## '(Intercept):3', '(Intercept):4', '(Intercept):5', 's(alcohol, df = df):1', 's(alcohol, df = df):2'
## 
## 
## Reference group is level  6  of the response
## 
## $Model_DF_4
## 
## Call:
## vglm(formula = quality ~ s(vol_acidity, df = df) + s(alcohol, 
##     df = df), family = multinomial(), data = training_data)
## 
## Coefficients: 
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1              16.4635     5.9337   2.775 0.005527 ** 
## (Intercept):2              13.7776     3.5974   3.830 0.000128 ***
## (Intercept):3              24.6307     3.1979   7.702 1.34e-14 ***
## (Intercept):4              16.3638     3.1098   5.262 1.43e-07 ***
## (Intercept):5               9.8416     3.1244      NA       NA    
## s(vol_acidity, df = df):1  11.2820     2.4130   4.676 2.93e-06 ***
## s(vol_acidity, df = df):2   7.2018     1.9837   3.630 0.000283 ***
## s(vol_acidity, df = df):3   3.9612     1.8496   2.142 0.032221 *  
## s(vol_acidity, df = df):4   1.3166     1.8262   0.721 0.470936    
## s(vol_acidity, df = df):5  -2.2841     1.8786  -1.216 0.224042    
## s(alcohol, df = df):1      -2.2430     0.5930  -3.782 0.000155 ***
## s(alcohol, df = df):2      -1.4991     0.3022  -4.961 7.00e-07 ***
## s(alcohol, df = df):3      -2.1240     0.2585  -8.217  < 2e-16 ***
## s(alcohol, df = df):4      -1.1806     0.2475  -4.771 1.83e-06 ***
## s(alcohol, df = df):5      -0.5457     0.2472  -2.208 0.027269 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Number of linear predictors:  5 
## 
## Names of linear predictors: log(mu[,1]/mu[,6]), log(mu[,2]/mu[,6]), 
## log(mu[,3]/mu[,6]), log(mu[,4]/mu[,6]), log(mu[,5]/mu[,6])
## 
## Residual deviance: 2528.709 on 6395 degrees of freedom
## 
## Log-likelihood: -1264.354 on 6395 degrees of freedom
## 
## Number of Fisher scoring iterations: 8 
## 
## Warning: Hauck-Donner effect detected in the following estimate(s):
## '(Intercept):3', '(Intercept):4', '(Intercept):5', 's(alcohol, df = df):1', 's(alcohol, df = df):2'
## 
## 
## Reference group is level  6  of the response
## 
## $Model_DF_5
## 
## Call:
## vglm(formula = quality ~ s(vol_acidity, df = df) + s(alcohol, 
##     df = df), family = multinomial(), data = training_data)
## 
## Coefficients: 
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1              16.4635     5.9337   2.775 0.005527 ** 
## (Intercept):2              13.7776     3.5974   3.830 0.000128 ***
## (Intercept):3              24.6307     3.1979   7.702 1.34e-14 ***
## (Intercept):4              16.3638     3.1098   5.262 1.43e-07 ***
## (Intercept):5               9.8416     3.1244      NA       NA    
## s(vol_acidity, df = df):1  11.2820     2.4130   4.676 2.93e-06 ***
## s(vol_acidity, df = df):2   7.2018     1.9837   3.630 0.000283 ***
## s(vol_acidity, df = df):3   3.9612     1.8496   2.142 0.032221 *  
## s(vol_acidity, df = df):4   1.3166     1.8262   0.721 0.470936    
## s(vol_acidity, df = df):5  -2.2841     1.8786  -1.216 0.224042    
## s(alcohol, df = df):1      -2.2430     0.5930  -3.782 0.000155 ***
## s(alcohol, df = df):2      -1.4991     0.3022  -4.961 7.00e-07 ***
## s(alcohol, df = df):3      -2.1240     0.2585  -8.217  < 2e-16 ***
## s(alcohol, df = df):4      -1.1806     0.2475  -4.771 1.83e-06 ***
## s(alcohol, df = df):5      -0.5457     0.2472  -2.208 0.027269 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Number of linear predictors:  5 
## 
## Names of linear predictors: log(mu[,1]/mu[,6]), log(mu[,2]/mu[,6]), 
## log(mu[,3]/mu[,6]), log(mu[,4]/mu[,6]), log(mu[,5]/mu[,6])
## 
## Residual deviance: 2528.709 on 6395 degrees of freedom
## 
## Log-likelihood: -1264.354 on 6395 degrees of freedom
## 
## Number of Fisher scoring iterations: 8 
## 
## Warning: Hauck-Donner effect detected in the following estimate(s):
## '(Intercept):3', '(Intercept):4', '(Intercept):5', 's(alcohol, df = df):1', 's(alcohol, df = df):2'
## 
## 
## Reference group is level  6  of the response
best_df <- df_values[which.min(aic_values)]
best_model <- models[[paste("Model_DF", best_df, sep = "_")]]
summary(best_model)
## 
## Call:
## vglm(formula = quality ~ s(vol_acidity, df = df) + s(alcohol, 
##     df = df), family = multinomial(), data = training_data)
## 
## Coefficients: 
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1              16.4635     5.9337   2.775 0.005527 ** 
## (Intercept):2              13.7776     3.5974   3.830 0.000128 ***
## (Intercept):3              24.6307     3.1979   7.702 1.34e-14 ***
## (Intercept):4              16.3638     3.1098   5.262 1.43e-07 ***
## (Intercept):5               9.8416     3.1244      NA       NA    
## s(vol_acidity, df = df):1  11.2820     2.4130   4.676 2.93e-06 ***
## s(vol_acidity, df = df):2   7.2018     1.9837   3.630 0.000283 ***
## s(vol_acidity, df = df):3   3.9612     1.8496   2.142 0.032221 *  
## s(vol_acidity, df = df):4   1.3166     1.8262   0.721 0.470936    
## s(vol_acidity, df = df):5  -2.2841     1.8786  -1.216 0.224042    
## s(alcohol, df = df):1      -2.2430     0.5930  -3.782 0.000155 ***
## s(alcohol, df = df):2      -1.4991     0.3022  -4.961 7.00e-07 ***
## s(alcohol, df = df):3      -2.1240     0.2585  -8.217  < 2e-16 ***
## s(alcohol, df = df):4      -1.1806     0.2475  -4.771 1.83e-06 ***
## s(alcohol, df = df):5      -0.5457     0.2472  -2.208 0.027269 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Number of linear predictors:  5 
## 
## Names of linear predictors: log(mu[,1]/mu[,6]), log(mu[,2]/mu[,6]), 
## log(mu[,3]/mu[,6]), log(mu[,4]/mu[,6]), log(mu[,5]/mu[,6])
## 
## Residual deviance: 2528.709 on 6395 degrees of freedom
## 
## Log-likelihood: -1264.354 on 6395 degrees of freedom
## 
## Number of Fisher scoring iterations: 8 
## 
## Warning: Hauck-Donner effect detected in the following estimate(s):
## '(Intercept):3', '(Intercept):4', '(Intercept):5', 's(alcohol, df = df):1', 's(alcohol, df = df):2'
## 
## 
## Reference group is level  6  of the response
# Plot the splines
plot(best_model, select = 1)  # vol_acidity spline
## Warning in plot.window(...): "select"不是图形参数
## Warning in plot.xy(xy, type, ...): "select"不是图形参数
## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数

## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数
## Warning in box(...): "select"不是图形参数
## Warning in title(...): "select"不是图形参数

## Warning in plot.window(...): "select"不是图形参数
## Warning in plot.xy(xy, type, ...): "select"不是图形参数
## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数

## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数
## Warning in box(...): "select"不是图形参数
## Warning in title(...): "select"不是图形参数

## Warning in plot.window(...): "select"不是图形参数
## Warning in plot.xy(xy, type, ...): "select"不是图形参数
## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数

## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数
## Warning in box(...): "select"不是图形参数
## Warning in title(...): "select"不是图形参数

## Warning in plot.window(...): "select"不是图形参数
## Warning in plot.xy(xy, type, ...): "select"不是图形参数
## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数

## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数
## Warning in box(...): "select"不是图形参数
## Warning in title(...): "select"不是图形参数

## Warning in plot.window(...): "select"不是图形参数
## Warning in plot.xy(xy, type, ...): "select"不是图形参数
## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数

## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数
## Warning in box(...): "select"不是图形参数
## Warning in title(...): "select"不是图形参数

## Warning in plot.window(...): "select"不是图形参数
## Warning in plot.xy(xy, type, ...): "select"不是图形参数
## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数

## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数
## Warning in box(...): "select"不是图形参数
## Warning in title(...): "select"不是图形参数

## Warning in plot.window(...): "select"不是图形参数
## Warning in plot.xy(xy, type, ...): "select"不是图形参数
## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数

## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数
## Warning in box(...): "select"不是图形参数
## Warning in title(...): "select"不是图形参数

## Warning in plot.window(...): "select"不是图形参数
## Warning in plot.xy(xy, type, ...): "select"不是图形参数
## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数

## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数
## Warning in box(...): "select"不是图形参数
## Warning in title(...): "select"不是图形参数

## Warning in plot.window(...): "select"不是图形参数
## Warning in plot.xy(xy, type, ...): "select"不是图形参数
## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数

## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数
## Warning in box(...): "select"不是图形参数
## Warning in title(...): "select"不是图形参数

## Warning in plot.window(...): "select"不是图形参数
## Warning in plot.xy(xy, type, ...): "select"不是图形参数
## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数

## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数
## Warning in box(...): "select"不是图形参数
## Warning in title(...): "select"不是图形参数

plot(best_model, select = 2)  # alcohol spline
## Warning in plot.window(...): "select"不是图形参数
## Warning in plot.xy(xy, type, ...): "select"不是图形参数
## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数

## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数
## Warning in box(...): "select"不是图形参数
## Warning in title(...): "select"不是图形参数

## Warning in plot.window(...): "select"不是图形参数
## Warning in plot.xy(xy, type, ...): "select"不是图形参数
## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数

## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数
## Warning in box(...): "select"不是图形参数
## Warning in title(...): "select"不是图形参数

## Warning in plot.window(...): "select"不是图形参数
## Warning in plot.xy(xy, type, ...): "select"不是图形参数
## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数

## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数
## Warning in box(...): "select"不是图形参数
## Warning in title(...): "select"不是图形参数

## Warning in plot.window(...): "select"不是图形参数
## Warning in plot.xy(xy, type, ...): "select"不是图形参数
## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数

## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数
## Warning in box(...): "select"不是图形参数
## Warning in title(...): "select"不是图形参数

## Warning in plot.window(...): "select"不是图形参数
## Warning in plot.xy(xy, type, ...): "select"不是图形参数
## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数

## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数
## Warning in box(...): "select"不是图形参数
## Warning in title(...): "select"不是图形参数

## Warning in plot.window(...): "select"不是图形参数
## Warning in plot.xy(xy, type, ...): "select"不是图形参数
## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数

## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数
## Warning in box(...): "select"不是图形参数
## Warning in title(...): "select"不是图形参数

## Warning in plot.window(...): "select"不是图形参数
## Warning in plot.xy(xy, type, ...): "select"不是图形参数
## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数

## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数
## Warning in box(...): "select"不是图形参数
## Warning in title(...): "select"不是图形参数

## Warning in plot.window(...): "select"不是图形参数
## Warning in plot.xy(xy, type, ...): "select"不是图形参数
## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数

## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数
## Warning in box(...): "select"不是图形参数
## Warning in title(...): "select"不是图形参数

## Warning in plot.window(...): "select"不是图形参数
## Warning in plot.xy(xy, type, ...): "select"不是图形参数
## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数

## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数
## Warning in box(...): "select"不是图形参数
## Warning in title(...): "select"不是图形参数

## Warning in plot.window(...): "select"不是图形参数
## Warning in plot.xy(xy, type, ...): "select"不是图形参数
## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数

## Warning in axis(side = side, at = at, labels = labels, ...):
## "select"不是图形参数
## Warning in box(...): "select"不是图形参数
## Warning in title(...): "select"不是图形参数

# Plot residuals
residuals <- residuals(best_model)
plot(residuals)

independent_vars <- training_data[, !(names(training_data) %in% c("quality"))]


# Standardize the independent variables
independent_vars_scaled <- scale(independent_vars)

# Perform PCA
pca_result <- prcomp(independent_vars_scaled, center = TRUE, scale. = TRUE)

# Summary of PCA
summary(pca_result)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5     PC6     PC7
## Standard deviation     1.6357 1.1755 1.0432 0.9792 0.8667 0.76597 0.58888
## Proportion of Variance 0.3344 0.1727 0.1360 0.1199 0.0939 0.07334 0.04335
## Cumulative Proportion  0.3344 0.5072 0.6432 0.7631 0.8570 0.93030 0.97364
##                            PC8
## Standard deviation     0.45918
## Proportion of Variance 0.02636
## Cumulative Proportion  1.00000
plot(pca_result, type = "l")  # Scree plot

biplot(pca_result)

pca_data <- as.data.frame(pca_result$x[, 1:3])

# For example, if using in a clustering algorithm
clusters <- kmeans(pca_data, centers = 3)

pca_data$quality <- training_data$quality
pca_model <- lm(quality ~ ., data = pca_data)
## Warning in model.response(mf, "numeric"):
## 在因子响应上用type="numeric"的这一选项不会有效果
## Warning in Ops.factor(y, z$residuals): '-' not meaningful for factors
library(ggplot2)
ggplot(pca_data, aes(PC1, PC2, color = quality)) + geom_point()

library(scatterplot3d)

# Assuming pca_data is a data frame with the first three principal components and the quality variable
pca_data <- as.data.frame(pca_result$x[, 1:3])
pca_data$quality <- training_data$quality  # Make sure to match the quality with its principal components

# Create a 3D scatter plot
scatterplot3d(x = pca_data$PC1, y = pca_data$PC2, z = pca_data$PC3, 
              color = as.factor(pca_data$quality), pch = 19,
              xlab = "PC1", ylab = "PC2", zlab = "PC3")

# Adding a legend, if needed
legend("topright", legend = levels(factor(pca_data$quality)), col = 1:length(levels(factor(pca_data$quality))), pch = 19)

library(plotly)
## 
## 载入程辑包:'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Assuming pca_data is a data frame with the first three principal components and the quality variable
pca_data <- as.data.frame(pca_result$x[, 1:3])
pca_data$quality <- as.factor(training_data$quality)  # Make sure to match the quality with its principal components

# Create an interactive 3D scatter plot
fig <- plot_ly(data = pca_data, x = ~PC1, y = ~PC2, z = ~PC3, 
               color = ~quality, colors = RColorBrewer::brewer.pal(length(unique(pca_data$quality)), "Set1"),
               type = "scatter3d", mode = "markers")

# Add axes labels
fig <- fig %>% layout(scene = list(xaxis = list(title = 'PC1'),
                                   yaxis = list(title = 'PC2'),
                                   zaxis = list(title = 'PC3')))

# Render the plot
fig